import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
%matplotlib inline
plt.style.use(['bmh']) # bmh Styling used for Visulization
df = pd.read_csv("cleaned_car_buyers.csv")
print ('Data read into a pandas dataframe!')
Data read into a pandas dataframe!
df.head() #Calling first 5 rows
Manufacturer | Model | Price | Transmission | Power | Engine_CC | Fuel | Male | Female | Unknown | Total | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Renault | Clio | 22.10000 | 5 | 75 | 1219 | petrol | 241287 | 312556 | 28004 | 581847 |
1 | BMW | 320i | 47.84837 | 6 | 126 | 1995 | petrol | 408016 | 115843 | 29125 | 552984 |
2 | Volkswagen | Polo | 18.19250 | 5 | 60 | 1408 | petrol | 216333 | 299110 | 31701 | 547144 |
3 | Peugeot | 206 | 20.03375 | 4 | 71 | 1631 | petrol | 178698 | 250614 | 26135 | 455447 |
4 | Ford | Mondeo | 39.97375 | 1 | 130 | 1998 | petrol | 357452 | 69603 | 16550 | 443605 |
df.sort_values(by=['Total'], inplace=True, ascending=False)
df
Manufacturer | Model | Price | Transmission | Power | Engine_CC | Fuel | Male | Female | Unknown | Total | |
---|---|---|---|---|---|---|---|---|---|---|---|
5129 | Ford | Fiesta | 18.532143 | 5 | 68 | 1166 | petrol | 730276 | 789633 | 41603 | 1561512 |
5353 | Ford | Fiesta | 18.532143 | 5 | 68 | 1166 | petrol | 737836 | 782479 | 39028 | 1559343 |
4889 | Ford | Fiesta | 18.532143 | 5 | 68 | 1166 | petrol | 710871 | 781033 | 44195 | 1536099 |
5354 | Ford | Escort | 16.208744 | 4 | 63 | 1650 | petrol | 989746 | 501907 | 35418 | 1527071 |
5765 | Ford | Escort | 16.208744 | 4 | 63 | 1650 | petrol | 1007113 | 489732 | 29926 | 1526771 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4888 | Opel | Ascona | 15.118118 | 4 | 61 | 1603 | petrol | 795 | 185 | 24 | 1004 |
3214 | Lada | Samara | 10.915800 | 5 | 49 | 1372 | petrol | 773 | 212 | 19 | 1004 |
1124 | Skoda | Favorit | 7.863333 | 5 | 44 | 1289 | petrol | 685 | 281 | 38 | 1004 |
1860 | Renault | 21 | 20.353789 | 4 | 67 | 1996 | petrol | 784 | 178 | 41 | 1003 |
746 | Mini | Roadster | 32.658364 | 6 | 101 | 1424 | petrol | 291 | 628 | 82 | 1001 |
5949 rows × 11 columns
df.dtypes #Checking Datatypes
Manufacturer object Model object Price float64 Transmission int64 Power int64 Engine_CC int64 Fuel object Male int64 Female int64 Unknown int64 Total int64 dtype: object
df.describe() #Understanding the data
Price | Transmission | Power | Engine_CC | Male | Female | Unknown | Total | |
---|---|---|---|---|---|---|---|---|
count | 5949.000000 | 5949.000000 | 5949.000000 | 5949.000000 | 5.949000e+03 | 5949.000000 | 5949.000000 | 5.949000e+03 |
mean | 41.087399 | 4.854597 | 110.448311 | 1992.979660 | 3.312574e+04 | 20087.933602 | 2461.825181 | 5.567482e+04 |
std | 51.836351 | 1.241327 | 67.076695 | 806.659888 | 8.098990e+04 | 60387.333760 | 5739.449368 | 1.406078e+05 |
min | 3.332200 | 0.000000 | 15.000000 | 469.000000 | 2.910000e+02 | 30.000000 | 13.000000 | 1.001000e+03 |
25% | 18.198429 | 4.000000 | 70.000000 | 1498.000000 | 2.804000e+03 | 881.000000 | 167.000000 | 3.958000e+03 |
50% | 28.960200 | 5.000000 | 94.000000 | 1833.000000 | 8.180000e+03 | 3409.000000 | 561.000000 | 1.293900e+04 |
75% | 43.548516 | 6.000000 | 121.000000 | 2247.000000 | 2.567300e+04 | 12287.000000 | 1964.000000 | 4.127100e+04 |
max | 646.605000 | 8.000000 | 450.000000 | 6208.000000 | 1.007113e+06 | 789633.000000 | 55632.000000 | 1.561512e+06 |
df.info(verbose=False) #Information about the dataframe
<class 'pandas.core.frame.DataFrame'> Int64Index: 5949 entries, 5129 to 746 Columns: 11 entries, Manufacturer to Total dtypes: float64(1), int64(7), object(3) memory usage: 557.7+ KB
df.isnull().sum()
Manufacturer 0 Model 0 Price 0 Transmission 0 Power 0 Engine_CC 0 Fuel 0 Male 0 Female 0 Unknown 0 Total 0 dtype: int64
df.corr() #Checking Correlation between columns
Price | Transmission | Power | Engine_CC | Male | Female | Unknown | Total | |
---|---|---|---|---|---|---|---|---|
Price | 1.000000 | 0.424442 | 0.857374 | 0.770028 | -0.068462 | -0.100583 | -0.083540 | -0.086043 |
Transmission | 0.424442 | 1.000000 | 0.547033 | 0.292420 | -0.038833 | -0.047664 | -0.023994 | -0.043816 |
Power | 0.857374 | 0.547033 | 1.000000 | 0.870942 | -0.084845 | -0.142175 | -0.109509 | -0.114402 |
Engine_CC | 0.770028 | 0.292420 | 0.870942 | 1.000000 | -0.114367 | -0.182725 | -0.162236 | -0.150975 |
Male | -0.068462 | -0.038833 | -0.084845 | -0.114367 | 1.000000 | 0.825118 | 0.883884 | 0.966446 |
Female | -0.100583 | -0.047664 | -0.142175 | -0.182725 | 0.825118 | 1.000000 | 0.921717 | 0.942365 |
Unknown | -0.083540 | -0.023994 | -0.109509 | -0.162236 | 0.883884 | 0.921717 | 1.000000 | 0.945789 |
Total | -0.086043 | -0.043816 | -0.114402 | -0.150975 | 0.966446 | 0.942365 | 0.945789 | 1.000000 |
To view the dimensions of the dataframe, we use the .shape parameter.
df.shape # size of dataframe (rows, columns)
(5949, 11)
df1 = df[['Total','Male','Female','Unknown','Manufacturer','Model','Price']] #creating new dataframe
df1 = df1.groupby(['Model'],as_index=False).max() #Using groupby function to create required pivot table
df1
Model | Total | Male | Female | Unknown | Manufacturer | Price | |
---|---|---|---|---|---|---|---|
0 | 09-May | 44398 | 35025 | 7000 | 2374 | Saab | 52.957194 |
1 | 100 | 40026 | 33128 | 5716 | 1419 | Audi | 37.326360 |
2 | 1007 | 7130 | 3347 | 3486 | 417 | Peugeot | 18.657000 |
3 | 104 | 4637 | 2384 | 2115 | 138 | Peugeot | 7.508333 |
4 | 105 | 6512 | 4196 | 2209 | 107 | Skoda | 4.626333 |
... | ... | ... | ... | ... | ... | ... | ... |
480 | Z4 | 29553 | 19343 | 8462 | 1748 | BMW | 64.109625 |
481 | ZR | 48479 | 28381 | 18426 | 1736 | MG | 21.023333 |
482 | ZT | 17280 | 14496 | 2287 | 513 | MG | 42.746667 |
483 | ZX | 128499 | 81329 | 42312 | 5475 | Citroen | 16.056769 |
484 | Zafira | 1259 | 762 | 415 | 116 | Opel | 35.270000 |
485 rows × 7 columns
df1.sort_values(by=['Total'], inplace=True, ascending=False) #sorting values in decending order to move top ten Ownership to top
df1
Model | Total | Male | Female | Unknown | Manufacturer | Price | |
---|---|---|---|---|---|---|---|
213 | Fiesta | 1561512 | 737836 | 789633 | 52404 | Ford | 18.532143 |
202 | Escort | 1527071 | 1007113 | 501907 | 38200 | Ford | 16.208744 |
214 | Focus | 1273125 | 796190 | 421303 | 55632 | Ford | 30.619322 |
393 | Sierra | 946240 | 781210 | 150811 | 14367 | Ford | 18.295562 |
228 | Golf | 824931 | 471514 | 306459 | 46958 | Volkswagen | 31.242154 |
... | ... | ... | ... | ... | ... | ... | ... |
460 | X-90 | 1147 | 417 | 712 | 64 | Suzuki | 16.641000 |
449 | Vel | 1132 | 932 | 161 | 52 | Renault | 51.641562 |
188 | DS5 | 1081 | 882 | 160 | 39 | Citroen | 42.557333 |
258 | Korando | 1080 | 786 | 259 | 35 | Ssangyong | 37.904000 |
232 | Griffith | 1078 | 972 | 73 | 43 | TVR | 71.187000 |
485 rows × 7 columns
df1 = df1.head(10) #Filtering first 10 rows
df1
Model | Total | Male | Female | Unknown | Manufacturer | Price | |
---|---|---|---|---|---|---|---|
213 | Fiesta | 1561512 | 737836 | 789633 | 52404 | Ford | 18.532143 |
202 | Escort | 1527071 | 1007113 | 501907 | 38200 | Ford | 16.208744 |
214 | Focus | 1273125 | 796190 | 421303 | 55632 | Ford | 30.619322 |
393 | Sierra | 946240 | 781210 | 150811 | 14367 | Ford | 18.295562 |
228 | Golf | 824931 | 471514 | 306459 | 46958 | Volkswagen | 31.242154 |
297 | Mondeo | 759309 | 594469 | 138468 | 26609 | Ford | 39.973750 |
169 | Clio | 693039 | 269970 | 390458 | 32611 | Renault | 22.100000 |
30 | 2000 | 652975 | 445497 | 193015 | 19517 | Rover | 17.182000 |
33 | 206 | 568218 | 199330 | 339260 | 30018 | Peugeot | 20.033750 |
48 | 320i | 567097 | 408016 | 131916 | 29801 | BMW | 47.848370 |
# Creating the Bar Graph to show Top 10 Vehicles Ownership by gender
df1.plot(x="Model", y=["Male", "Female", "Unknown"], kind="barh",title="Top 10 Vehicles Ownership by Gender",xlabel = "Car Vehicles",figsize=(10,10))
<AxesSubplot:title={'center':'Top 10 Vehicles Ownership by Gender'}, ylabel='Car Vehicles'>
df2 = df[['Manufacturer','Model','Price','Power']] #creating new dataframe
df2
Manufacturer | Model | Price | Power | |
---|---|---|---|---|
5129 | Ford | Fiesta | 18.532143 | 68 |
5353 | Ford | Fiesta | 18.532143 | 68 |
4889 | Ford | Fiesta | 18.532143 | 68 |
5354 | Ford | Escort | 16.208744 | 63 |
5765 | Ford | Escort | 16.208744 | 63 |
... | ... | ... | ... | ... |
4888 | Opel | Ascona | 15.118118 | 61 |
3214 | Lada | Samara | 10.915800 | 49 |
1124 | Skoda | Favorit | 7.863333 | 44 |
1860 | Renault | 21 | 20.353789 | 67 |
746 | Mini | Roadster | 32.658364 | 101 |
5949 rows × 4 columns
sns.displot(df2['Price'], color="r")
<seaborn.axisgrid.FacetGrid at 0x20980bebee0>
# Let's declare a variable that will be equal to the 99th percentile of the 'Price' variable
q = df2['Price'].quantile(0.99)
# Then we create a new df, with the condition that all prices must be below the 99 percentile of 'Price'
df3 = df2[df2['Price']<q]
# In this way we have essentially removed the top 1% of the data about 'Price'
df3.describe(include='all')
Manufacturer | Model | Price | Power | |
---|---|---|---|---|
count | 5883 | 5883 | 5883.000000 | 5883.000000 |
unique | 51 | 480 | NaN | NaN |
top | Ford | Space | NaN | NaN |
freq | 329 | 49 | NaN | NaN |
mean | NaN | NaN | 37.101089 | 106.933197 |
std | NaN | NaN | 31.143678 | 58.569440 |
min | NaN | NaN | 3.332200 | 15.000000 |
25% | NaN | NaN | 18.192500 | 70.000000 |
50% | NaN | NaN | 28.608205 | 94.000000 |
75% | NaN | NaN | 43.068421 | 120.000000 |
max | NaN | NaN | 260.287000 | 433.000000 |
# We can check the PDF once again to ensure that the result is still distributed in the same way overall
# however, there are much fewer outliers
sns.displot(df3['Price'],color="g")
<seaborn.axisgrid.FacetGrid at 0x20980da7550>
sns.displot(df2['Power'])
<seaborn.axisgrid.FacetGrid at 0x209806a52b0>
# Let's declare a variable that will be equal to the 99th percentile of the 'Power' variable
q = df3['Power'].quantile(0.99)
# Then we can create a new df, with the condition that all prices must be below the 99 percentile of 'Power'
df4 = df3[df3['Power']<q]
# In this way we have essentially removed the top 1% of the data about 'Power'
df4.describe(include='all')
Manufacturer | Model | Price | Power | |
---|---|---|---|---|
count | 5824 | 5824 | 5824.000000 | 5824.000000 |
unique | 50 | 471 | NaN | NaN |
top | Ford | Space | NaN | NaN |
freq | 329 | 49 | NaN | NaN |
mean | NaN | NaN | 35.718906 | 104.222356 |
std | NaN | NaN | 27.787608 | 52.158339 |
min | NaN | NaN | 3.332200 | 15.000000 |
25% | NaN | NaN | 18.167154 | 69.000000 |
50% | NaN | NaN | 28.379000 | 93.000000 |
75% | NaN | NaN | 42.341317 | 118.000000 |
max | NaN | NaN | 200.198250 | 316.000000 |
# We check the PDF once again to ensure that the result is still distributed in the same way overall
# however, there are much fewer outliers
sns.displot(df4['Power'],color="g")
<seaborn.axisgrid.FacetGrid at 0x209807dfbb0>
#Checking correlation of Male with other features
df[df.columns[1:]].corr()['Male'][:]
Price -0.068462 Transmission -0.038833 Power -0.084845 Engine_CC -0.114367 Male 1.000000 Female 0.825118 Unknown 0.883884 Total 0.966446 Name: Male, dtype: float64
f, ax = plt.subplots(figsize=(12, 5))
c = sns.regplot(x="Male", y="Total", data=df, ax=ax)
plt.ylim(0,)
c.set_title('Total Correlation with Male', fontdict={'fontsize':18}, pad=16)
Text(0.5, 1.0, 'Total Correlation with Male')
f, ax = plt.subplots(figsize=(12, 5))
c1 = sns.regplot(x="Male", y="Transmission", data=df, ax=ax)
plt.ylim(0,)
c1.set_title('Transmission Correlation with Male', fontdict={'fontsize':18}, pad=16)
Text(0.5, 1.0, 'Transmission Correlation with Male')
f, ax = plt.subplots(figsize=(12, 5))
c2 = sns.regplot(x="Male", y="Price", data=df, ax=ax)
plt.ylim(0,)
c2.set_title('Price Correlation with Male', fontdict={'fontsize':18}, pad=16)
Text(0.5, 1.0, 'Price Correlation with Male')
plt.figure(figsize=(4, 10))
heatmap = sns.heatmap(df[df.columns[1:]].corr()[['Male']][1:].sort_values(by='Male', ascending=False), vmin=-1, vmax=1, annot=True, cmap='RdBu')
heatmap.set_title('Features and their Correlation with Male Column', fontdict={'fontsize':18}, pad=16)
plt.savefig('Male.png', dpi=300, bbox_inches='tight')